No Hyperparam Tuning¶
In [ ]:
import numpy as np
from utilities.dataset_jlb import CityDataset

# Training cities; BerlinTest is held out for evaluation.
# FIX: the original list contained 'London' twice — the loader only ever
# used 10 unique cities (see the "Loading data from cities" output below).
cities = ['Aachen', 'London', 'CapeTown', 'Hamburg', 'Johannesburg',
          'Montreal', 'Paris', 'Seoul', 'Singapore', 'Sydney']
In [ ]:
# Provenance: the BerlinTest building mask on disk was generated once with
# the DataHandler below and cached; kept commented out here for reference.
# from data_acquisition import DataHandler
# from utilities.utils import setup_logger
# logger = setup_logger(level="ERROR")
# dh = DataHandler(logger, "data")
# BerlinTest = dh.get_building_mask("BerlinTest", all_touched=True)
In [ ]:
# Sanity-check that the building mask lines up with the Sentinel imagery.
import rasterio
from utilities.plot_utils import (plot_band_with_mask,
                                  histogram_scaler_bands,
                                  describe_tif,
                                  plot_bands)

# FIX: the absolute city directory was duplicated in both rasterio.open calls;
# hoist it into one constant. NOTE(review): still an absolute local path —
# consider making it relative to a configurable DATA_DIR.
CITY_DIR = "/home/jlb/Projects/architecture-of-ml-systems/data/BerlinTest"

# Band name -> index into the stacked array read below (read order 1..6).
b = {
    "B04": 0,
    "B03": 1,
    "B02": 2,
    "B08": 3,
    "B12": 4,
    "B11": 5,
}

with rasterio.open(f"{CITY_DIR}/openEO.tif") as src:
    describe_tif(src)
    data = src.read([1, 2, 3, 4, 5, 6])  # all six bands as one array
with rasterio.open(f"{CITY_DIR}/building_mask_dense.tif") as src:
    describe_tif(src)
    labels = src.read(1)  # single-band 0/1 building mask (see stats below)

# Clip each band to its 1st/99th percentile before plotting to suppress outliers.
data = histogram_scaler_bands(data, 1.0, 99.0)
plot_bands(data, bands=[b["B04"], b["B03"], b["B02"]], title="BerlinTest")
plot_band_with_mask(data[b["B04"]], labels, title="BerlinTest")
Profile:
{'driver': 'GTiff', 'dtype': 'int16', 'nodata': -32768.0, 'width': 1427, 'height': 1361, 'count': 6, 'crs': CRS.from_epsg(32633), 'transform': Affine(10.0, 0.0, 384100.0,
0.0, -10.0, 5826300.0), 'blockxsize': 512, 'blockysize': 512, 'tiled': True, 'compress': 'deflate', 'interleave': 'band'}
SHAPE: (1361, 1427)
dtype int16
max 10608
min 98
mean 772.9615595523923
std 477.7241897203279
sum 1501204974
Profile:
{'driver': 'GTiff', 'dtype': 'int16', 'nodata': -32768.0, 'width': 1427, 'height': 1361, 'count': 1, 'crs': CRS.from_epsg(32633), 'transform': Affine(10.0, 0.0, 384100.0,
0.0, -10.0, 5826300.0), 'blockysize': 2, 'tiled': False, 'interleave': 'band'}
SHAPE: (1361, 1427)
dtype int16
max 1
min 0
mean 0.40554036331956334
std 0.4909963106156703
sum 787619
p_down, p_up [200. 296. 232. 288. 219. 313.] [2545. 2351. 2146. 4188. 2732. 3103.]
In [ ]:
# FIX: the absolute data root was duplicated in both CityDataset calls;
# hoist it into one constant. NOTE(review): absolute local path — consider
# a configurable DATA_DIR.
DATA_ROOT = "/home/jlb/Projects/architecture-of-ml-systems/data"

# Training dataset: 16x16 patches from all training cities. min_labels=0.1
# keeps only patches with at least 10% building pixels (the split summary
# below reports a min label fraction of ~0.1006).
dataset = CityDataset(DATA_ROOT,
                      patch_size=16,
                      data_name="openEO.tif",
                      labels_name="building_mask_dense.tif",
                      image_bands=[1, 2, 3, 4, 5, 6],
                      min_labels=0.1,
                      cities=cities,
                      train=True,)

# Test dataset: the held-out BerlinTest scene, not patched (the test sample
# below is the full 1361x1427 scene).
dataset_test = CityDataset(DATA_ROOT,
                           data_name="openEO.tif",
                           labels_name="building_mask_dense.tif",
                           image_bands=[1, 2, 3, 4, 5, 6],
                           cities=["BerlinTest"],
                           train=False)
Loading data from cities: ['Singapore', 'Johannesburg', 'London', 'Montreal', 'Seoul', 'Aachen', 'CapeTown', 'Hamburg', 'Paris', 'Sydney']
Loading Images: 0%| | 0/10 [00:00<?, ?it/s]
Loading Labels: 0%| | 0/10 [00:00<?, ?it/s]
Creating Patches from Images: 0it [00:00, ?it/s]
Loading data from cities: ['BerlinTest']
Loading Images: 0%| | 0/1 [00:00<?, ?it/s]
Loading Labels: 0%| | 0/1 [00:00<?, ?it/s]
In [ ]:
# The patch size can be changed without reloading the rasters from disk:
# only "Creating Patches from Images" runs again (see output below),
# not the image/label loading.
dataset.update_patch_size(32) # 32x32 patches (dataset was built with 16x16)
Creating Patches from Images: 0it [00:00, ?it/s]
In [ ]:
# torch / Lightning imports and train/val dataloader setup
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from torch.utils.tensorboard import SummaryWriter
import lightning as L
from typing import Any
from torch.utils.data import DataLoader, Dataset
from lightning import seed_everything
# throughput measured on this machine:
# batch size 8 with 16x16 patches GPU memory: 36%, 117 it/s
# batch size 16 with 16x16 patches GPU memory: 37%, 117 it/s
# batch size 32 with 16x16 patches GPU memory: 40%, 90 it/s
batch_size = 32
# 90/10 train/val split with a fixed seed for reproducibility;
# n_groups presumably controls how patches are grouped before splitting —
# TODO confirm against CityDataset.train_val_split
train_dataset, val_dataset = dataset.train_val_split(val_size=0.1,
n_groups=100,
random_state=42,
show_summary=True)
# shuffle only the training loader; validation order stays fixed
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=20)
val_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=20)
# sanity-check sizes and the tensor shapes of one batch from each loader
print(f"Len total dataset: {len(dataset)}")
print(f"len train dataset: {len(train_dataset)}")
print(f"len val dataset: {len(val_dataset)}")
sample_train = next(iter(train_dl))
sample_val = next(iter(val_dl))
print("Training Sample (data, labels)",sample_train["data"].shape, sample_train["labels"].shape)
print("Validation Sample (data, labels)", sample_val["data"].shape, sample_val["labels"].shape)
print("Len train", len(train_dl))
print("Len val", len(val_dl))
Train: Number of samples: 43038 Shape of Train data (data, label) (6, 32, 32) (32, 32) Val: Number of samples: 4783 Shape of Val data (data, label) (6, 32, 32) (32, 32) ************************************************** Mean percentage of 1 labels in train: 0.3563743586699545 Mean percentage of 1 labels in val: 0.3564508251881664 Mean percentage of 1 labels in all data: 0.35638200676088955 ************************************************** Std of percentage 1 labels in train: 0.18495708352186507 Std of percentage 1 labels in val: 0.18479195943286666 Std of percentage 1 labels in all data: 0.18494057606377182 ************************************************** Min percentage of 1 labels in train: 0.1005859375 Min percentage of 1 labels in val: 0.1005859375 Min percentage of 1 labels in all data: 0.1005859375 ************************************************** Max percentage of 1 labels in train: 0.9814453125 Max percentage of 1 labels in val: 0.98828125 Max percentage of 1 labels in all data: 0.98828125 ************************************************** Len total dataset: 47821 len train dataset: 43038 len val dataset: 4783 Training Sample (data, labels) torch.Size([32, 6, 32, 32]) torch.Size([32, 32, 32]) Validation Sample (data, labels) torch.Size([32, 6, 32, 32]) torch.Size([32, 32, 32]) Len train 1345 Len val 150
In [ ]:
# Test loader over the held-out BerlinTest data: one item per batch,
# deterministic order.
test_dl = DataLoader(
    dataset_test,
    batch_size=1,
    shuffle=False,
    num_workers=20,
)
# Peek at one batch to confirm the full-scene shapes.
sample_test = next(iter(test_dl))
print(sample_test["data"].shape, sample_test["labels"].shape)
torch.Size([1, 6, 1361, 1427]) torch.Size([1, 1361, 1427])
Load Model¶
In [ ]:
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint
from utilities.Lightning_utils import LitModule
from utilities.Lightning_utils import ConvNetSimple

# model: simple CNN baseline (6 input bands) wrapped in the Lightning adapter
convmodel = LitModule(ConvNetSimple(channels=6))

# trainer factory
def get_trainer(directory, patience=2, max_epochs=100, save_top_k=2):
    """Build a Lightning Trainer that early-stops and checkpoints on val_loss.

    Generalized: patience, max_epochs and save_top_k were hard-coded; they
    are now parameters with the original values as defaults, so existing
    calls behave identically.

    Args:
        directory: subdirectory under ``models/exp01/`` for logs and checkpoints.
        patience: epochs without val_loss improvement before early stopping.
        max_epochs: hard cap on the number of training epochs.
        save_top_k: number of best checkpoints to keep.

    Returns:
        A configured ``lightning.Trainer``.
    """
    # single source of truth for both the log root and the checkpoint dir
    root = f"models/exp01/{directory}"
    trainer = L.Trainer(
        default_root_dir=root,
        callbacks=[
            EarlyStopping(
                monitor="val_loss",
                mode="min",
                patience=patience,
            ),
            ModelCheckpoint(
                monitor="val_loss",
                mode="min",
                save_top_k=save_top_k,
                dirpath=root,
                filename="best_model"
            )
        ],
        # val_check_interval=1,
        fast_dev_run=False,
        num_sanity_val_steps=2,
        max_epochs=max_epochs,
        log_every_n_steps=20,
    )
    return trainer
Tune LR¶
In [ ]:
from lightning.pytorch.tuner.tuning import Tuner
In [ ]:
trainer = get_trainer("ConvNetSimple")
seed_everything(49)  # fix RNG state so the LR sweep is reproducible
tuner = Tuner(trainer=trainer)
torch.set_float32_matmul_precision('high') # for tensor cores
# sweep learning rates in [1e-6, 1e-2] over 5000 steps; lr_find sets the
# model's learning rate in place (see "Learning rate set to ..." output)
tuner.lr_find(convmodel, train_dl, val_dl, min_lr=1e-6, max_lr=0.01, num_training=5000)
# good lr found on a previous run: 0.001174897554939528
GPU available: True (cuda), used: True TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs Seed set to 49
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Finding best initial lr: 0%| | 0/5000 [00:00<?, ?it/s]
`Trainer.fit` stopped: `max_steps=5000` reached. Learning rate set to 0.0016143585568264868 Restoring states from the checkpoint path at models/exp01/ConvNetSimple/.lr_find_a760014e-991e-4f02-b8d7-9a47d7432101.ckpt Restored all states from the checkpoint at models/exp01/ConvNetSimple/.lr_find_a760014e-991e-4f02-b8d7-9a47d7432101.ckpt
Out[ ]:
<lightning.pytorch.tuner.lr_finder._LRFinder at 0x7fecbf4f0d90>
Train Model¶
In [ ]:
# train the ConvNet baseline; early-stops on val_loss plateau and
# checkpoints the best models via the trainer's callbacks
trainer.fit(convmodel,
train_dataloaders=train_dl,
val_dataloaders=val_dl
)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params | Mode ------------------------------------------------ 0 | model | ConvNetSimple | 94.2 K | train 1 | loss | BCELoss | 0 | train ------------------------------------------------ 94.2 K Trainable params 0 Non-trainable params 94.2 K Total params 0.377 Total estimated model params size (MB)
Sanity Checking: | | 0/? [00:00<?, ?it/s]
Training: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
In [ ]:
# reload the best (lowest val_loss) checkpoint and evaluate on the held-out city
best_model_conv = LitModule.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
trainer.test(model=best_model_conv, dataloaders=test_dl)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Testing: | | 0/? [00:00<?, ?it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Test metric DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
test_loss_epoch 0.38404497504234314
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Out[ ]:
[{'test_loss_epoch': 0.38404497504234314}]
In [ ]:
# plot the output of the test
# NOTE(review): plt appears unused in this cell — confirm before removing
import matplotlib.pyplot as plt
sample = next(iter(test_dl))
print(f"Sample shape: {sample['data'].shape}")
# convmodel.eval()
# output = convmodel(sample["data"])
# print(output.shape)
# run inference with the best ConvNet checkpoint over the full test scene
prediction = trainer.predict(model=best_model_conv, dataloaders=test_dl)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Sample shape: torch.Size([1, 6, 1361, 1427])
Predicting: | | 0/? [00:00<?, ?it/s]
In [ ]:
# Flatten the (1, 1, H, W) prediction tensor into a 2-D probability map.
pred = prediction[0]
print(pred.shape)
output = pred.detach().numpy().squeeze()
print(output.shape)
torch.Size([1, 1, 1361, 1427]) (1361, 1427)
In [ ]:
from utilities.plot_utils import (
plot_prediction_with_thresholds,
plot_random_patch,
plot_output
)
# visualise the prediction map: whole scene, one random 6x6 patch
# (its values are printed below), and the map at several thresholds
plot_output(output)
plot_random_patch(output, patch_len=6)
plot_prediction_with_thresholds(output)
[[0.01106262 0.01344627 0.01856582 0.02052435 0.04994096 0.0941106 ] [0.0394295 0.04609582 0.0433219 0.04410533 0.06069243 0.06888498] [0.02360394 0.02088859 0.01290968 0.01555219 0.02547724 0.03814513] [0.03192084 0.0317886 0.02858587 0.02829278 0.04094206 0.0914407 ] [0.18983711 0.21338607 0.17830016 0.08393327 0.12551358 0.23672046] [0.32559654 0.32085553 0.39692518 0.28868127 0.34046698 0.36366984]]
Model U-Net¶
In [ ]:
# import torchmetrics
# from torchmetrics import Dice
from utilities.unet import UNet
# model: U-Net with one output channel, bilinear upsampling
unet = UNet(n_channels=len(dataset.get_image_bands()), n_classes=1, bilinear=True)
# UNet implementation uses the BCEWithLogitsLoss, lr of 1e-5 default;
# the lr below is a starting point that lr_find overrides before fit
unet_lit = LitModule(unet, learning_rate=1e-4, loss=nn.BCEWithLogitsLoss())
torch.set_float32_matmul_precision('high')
unet_trainer = get_trainer("unet")
seed_everything(49)
# lr finder: sweeps [1e-6, 1e-2] and sets the learning rate in place
tuner = Tuner(unet_trainer)
tuner.lr_find(unet_lit, train_dl, val_dl, min_lr=1e-6, max_lr=0.01, num_training=5000)
# good lr found on a previous run: 0.0031915378551007614
# train; early-stops and checkpoints on val_loss via get_trainer callbacks
unet_trainer.fit(unet_lit,
train_dataloaders=train_dl,
val_dataloaders=val_dl
)
GPU available: True (cuda), used: True TPU available: False, using: 0 TPU cores HPU available: False, using: 0 HPUs Seed set to 49 Missing logger folder: models/exp01/unet/lightning_logs /home/jlb/Projects/architecture-of-ml-systems/.venv/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:652: Checkpoint directory /home/jlb/Projects/architecture-of-ml-systems/models/exp01/unet exists and is not empty. LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Finding best initial lr: 0%| | 0/5000 [00:00<?, ?it/s]
`Trainer.fit` stopped: `max_steps=5000` reached. Learning rate set to 0.0008534930716135486 Restoring states from the checkpoint path at models/exp01/unet/.lr_find_59fc8274-d55b-45bb-b76a-705e2fcc05f6.ckpt Restored all states from the checkpoint at models/exp01/unet/.lr_find_59fc8274-d55b-45bb-b76a-705e2fcc05f6.ckpt LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] | Name | Type | Params | Mode ---------------------------------------------------- 0 | model | UNet | 17.3 M | train 1 | loss | BCEWithLogitsLoss | 0 | train ---------------------------------------------------- 17.3 M Trainable params 0 Non-trainable params 17.3 M Total params 69.059 Total estimated model params size (MB)
Sanity Checking: | | 0/? [00:00<?, ?it/s]
Training: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
Validation: | | 0/? [00:00<?, ?it/s]
In [ ]:
# reload the best U-Net checkpoint and evaluate on the held-out BerlinTest scene
best_model = LitModule.load_from_checkpoint(unet_trainer.checkpoint_callback.best_model_path)
unet_trainer.test(model=best_model, dataloaders=test_dl)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Testing: | | 0/? [00:00<?, ?it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Test metric DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
test_loss_epoch 0.35365039110183716
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Out[ ]:
[{'test_loss_epoch': 0.35365039110183716}]
In [ ]:
# unet_trainer.test(model=unet_lit, dataloaders=test_dl)
In [ ]:
# run U-Net inference with the best checkpoint over the full test scene
prediction = unet_trainer.predict(model=best_model, dataloaders=test_dl)
# prediction = unet_trainer.predict(model=unet_lit, dataloaders=test_dl)
print(prediction[0].shape)
# prediction = F.interpolate(prediction[0], (output.shape[2], output.shape[3]), mode="bilinear")
# the U-Net was trained with BCEWithLogitsLoss (raw logits), so apply
# sigmoid here to turn logits into probabilities in [0, 1]
output = torch.sigmoid(prediction[0]).detach().numpy()
# output = prediction[0].detach().numpy()
print(output.shape)
# drop the singleton batch/channel dims to get a 2-D map
output = output.squeeze()
print(output.shape)
# output
plot_output(output)
plot_random_patch(output, patch_len=6)
plot_prediction_with_thresholds(output)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting: | | 0/? [00:00<?, ?it/s]
torch.Size([1, 1, 1361, 1427]) (1, 1, 1361, 1427) (1361, 1427)
[[0.03297826 0.03040841 0.03067048 0.0410015 0.0494289 0.04906508] [0.05410413 0.04744549 0.06491446 0.08485103 0.100301 0.08138527] [0.03333778 0.03750996 0.04904346 0.05081847 0.0359347 0.02989559] [0.06588066 0.07073054 0.07238087 0.06794988 0.05650675 0.07369687] [0.22064649 0.17509465 0.1439425 0.13441636 0.13590924 0.13892187] [0.2591825 0.19466874 0.16221802 0.13725671 0.11017903 0.12332146]]